#!/usr/bin/Rscript
##########################################################################################
# Social Media and Policy Responses to the COVID-19 Pandemic in Switzerland
##########################################################################################
# Description:
##########################################################################################
# Preparation of CdT Data to add to the SMD Data later
##########################################################################################
# Contents
##########################################################################################
# 1) Dependencies
# 2) Preparations
# 3) Calculate Sentiment
# 4) Topic / Classification
# 5) Merge the two data sets
##########################################################################################
# 1) Dependencies
##########################################################################################
#Libraries
suppressPackageStartupMessages(library(tools))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(quanteda))
suppressPackageStartupMessages(library(quantreg))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(glue))
suppressPackageStartupMessages(library(pbmcapply))
suppressPackageStartupMessages(library(urltools))
suppressPackageStartupMessages(library(cld2))
suppressPackageStartupMessages(library(elastic))
suppressPackageStartupMessages(library(rjson))
suppressPackageStartupMessages(library(jsonlite))
##########################################################################################
# 2) Preparations
##########################################################################################
# Start from a clean workspace (kept from the original script; note that this
# wipes any objects the caller had defined before running the script).
rm(list=ls())

# - set dir
# Locate this script so that all "../data" and "../lib" paths below resolve
# relative to it: Rscript passes the path as "--file=<path>", otherwise fall
# back to the file currently open in RStudio.
args <- commandArgs()
file_arg <- args[startsWith(args, "--file=")]

if (length(file_arg) == 0) {
  scriptName <- rstudioapi::getSourceEditorContext()$path
} else {
  scriptName <- sub("^--file=", "", file_arg[1])
}

# dirname() also copes with a bare file name (no directory part), where the
# former strsplit()/nchar() arithmetic produced NA.
pathName <- dirname(scriptName)

setwd(pathName)
parent_path <- getwd()

# Folder holding the scraped Corriere del Ticino articles (one JSON per file).
path2 <- normalizePath("~/Downloads/corrieredelticino", mustWork = TRUE)

# Get all json-files. `pattern` is a regular expression, not a glob, and full
# paths are requested so reading does not depend on the working directory.
temp <- list.files(path2, pattern = "\\.json$", full.names = TRUE)

# Read files and combine them to a df!
# Every JSON field becomes one column; NULL fields are replaced by NA so that
# each file yields a complete row.
df_pr_a <- purrr::map_df(temp, function(x) {
  purrr::map(jsonlite::fromJSON(x), function(y) ifelse(is.null(y), NA, y))
})

# The working directory is still the script directory (the original tried to
# return to it via an undefined object `path`).
setwd(parent_path)
getwd()
write_rds(df_pr_a, "../data/CdT_data.RDS")
##########################################################################################
# 3) Calculate Sentiment
##########################################################################################
df_pr_a <- read_rds("../data/CdT_data.RDS")

# Load Function Data:
# Load Dictionary from Proksch et al. Multilingual Sentiment analysis (based on the Lexicoder Dictionary)
# load() restores the objects into the workspace AND returns their names, so a
# single call per file is enough.
dictionaries <- load("../lib/auto_dictionaries_lsd.RData")

# Load improved Lexicons
dictionaries_2 <- load("../lib/lsde_frenche_germane.RData")

# Combine the two name vectors
dictionaries <- c(dictionaries, dictionaries_2)
rm(dictionaries_2)
gc()

# - text Cleaner Function:
# Normalise raw article text before the dictionary look-up.
# `tx` is a character vector; the result has the same length, lower-cased.
clean_article_text <- function(tx) {
  # Get Sentences in order: insert the missing blank after a full stop that is
  # glued to the next sentence ("fine.Poi" -> "fine. Poi").
  tx <- gsub('(?<=[a-z])\\.(?=[A-Z\\(])', '. ', tx, perl = TRUE)
  # Remove URL's
  tx <- gsub('http\\S+\\s*', '', tx, perl = TRUE)
  # Remove whatever "http..." token is still left (this step does not touch
  # numbers, although it used to be labelled that way).
  tx <- gsub('http[[:alnum:]]*', '', tx, perl = TRUE)
  # Remove leading and trailing whitespaces:
  tx <- gsub('^[[:space:]]*', '', tx, perl = TRUE)
  tx <- gsub('[[:space:]]*$', '', tx, perl = TRUE)
  # Remove weird quotation marks and other signs quanteda does not like.
  # NOTE(review): the pattern starts with an empty alternative; a character may
  # have been lost in front of the first "|" - confirm against the source data.
  tx <- gsub('|\\»|\\«|\\}|\\{', '', tx)
  tx <- gsub('\\–', '. ', tx)
  tx <- gsub('\\{|\\}|\\[|\\]|\\(|\\)', '', tx)
  as.character(tolower(tx))
}

text_df <- df_pr_a$Text
# Work through the texts in chunks of 10000 rows; split() yields no chunk at
# all for empty input, so the loop is skipped instead of running on 1:0.
chunk_rows <- split(seq_along(text_df), ceiling(seq_along(text_df) / 10000))
cleaned <- vector("list", length(chunk_rows))
gc()
for (k in seq_along(chunk_rows)) {
  cleaned[[k]] <- unlist(pbmclapply(chunk_rows[[k]], function(l) {
    clean_article_text(text_df[l])
  }, mc.cores = 8))
  gc()
}
df_pr_a$text <- as.character(unlist(cleaned, use.names = FALSE))
rm(text_df, chunk_rows, cleaned, k)

# All Corriere del Ticino articles are treated as Italian.
df_pr_a$la <- "it"


# Checkpoint the cleaned data and read it back.
write_rds(df_pr_a, "../data/CdT_data.RDS")
df_pr_a <- read_rds("../data/CdT_data.RDS")

# Cut the data into consecutive pieces of at most `n` rows each:
nr <- nrow(df_pr_a)
n <- 10000
chunk_of_row <- ceiling(seq_len(nr) / n)
dflist <- split(df_pr_a, chunk_of_row)
rm(chunk_of_row)
# Release the full frame; it is rebuilt from the scored pieces afterwards.
df_pr_a <- data.frame()
gc()

#Sentiment Calculator:
#Sentiment Calculator:
# Scores every Italian document of `data` with a dictionary-based sentiment.
#
# Arguments:
#   data             data frame with (at least) the columns `la` (language
#                    code) and `text` (cleaned article text).
#   dictionarieslist not referenced in the body; the Italian dictionary is
#                    fetched from the calling environments via
#                    get("extendeddict_it").
#   quant_nodes      number of threads handed to quanteda_options().
#   nodes            not referenced in the body.
#
# Returns the rows of `data` with la == "it" (all other rows are dropped),
# with three columns filled in:
#   sentiment_value  log((positive count + 0.5) / (negative count + 0.5))
#   positive_words   matched positive words, repeated per occurrence,
#                    separated by blanks
#   negative_words   same for the negative words
#
# Side effects: changes the global quanteda thread option and writes two
# status messages to stdout.
final_dat <- function(data, dictionarieslist = dictionaries, quant_nodes = 4, nodes = 4){
  quanteda_options(threads = quant_nodes)
  #########################################################
  # Add Sentiment Column:
  # (placeholders; overwritten further down for the Italian rows)
  data$sentiment_value <- NA
  data$positive_words <- NA
  data$negative_words <- NA
  cat("Make sure you have loaded the 'auto_dictionaries_lsd.RData' and 'lsde_frenche_germane.RData' in the environment.\nWithout them the function will not work\n")
  # Split data by Languages:
  # Only Italian rows are kept; nothing else is processed or returned.
  data_it <- data[data$la == "it", ]
  #Clear Mem:
  rm(data)
  gc()
  
  cat("Prepocessing of Articles done! Starting Sentiment Analysis!\n")
  #########################################################
  # Number of Italian rows (not used afterwards)
  numit <- nrow(data_it)
  ################                 
  # Process Italian
  protex <- corpus(data_it[,'text'])
  
  # The first element of the dictionary object is used as the positive word
  # list, the second one as the negative word list.
  dict_lang_pos_neg <-  get(paste0("extendeddict_it"))
  dict_lang_pos <- dict_lang_pos_neg[[1]]
  dict_lang_neg <- dict_lang_pos_neg[[2]]
  
  #Calculate Sentiment of Text: 
  # Document-feature matrix restricted to the positive words, reshaped to one
  # row per (document, word) and then collapsed to one row per document.
  # `document` is the number left after stripping all non-digits from doc_id
  # (presumably quanteda's default ids "text1", "text2", ... - TODO confirm).
  senti_words_pos_pre <- convert(dfm(protex, select = dict_lang_pos, verbose = FALSE), to = "data.frame")
  senti_words_pos_pre <- senti_words_pos_pre %>% gather(Word,Occurences, 2:ncol(senti_words_pos_pre)) %>% 
    mutate(document = as.numeric(gsub("\\D+", "", doc_id))) %>% 
    group_by(document) %>%
    summarize(Pos_Words = paste(rep(Word, Occurences), collapse = " "),
              Count_Pos_Words = sum(Occurences))
  
  # Same procedure for the negative words.
  senti_words_neg_pre <- convert(dfm(protex, select = dict_lang_neg, verbose = FALSE), to = "data.frame")
  senti_words_neg_pre <- senti_words_neg_pre %>% gather(Word,Occurences, 2:ncol(senti_words_neg_pre)) %>% 
    mutate(document = as.numeric(gsub("\\D+", "", doc_id))) %>% 
    group_by(document) %>%
    summarize(Neg_Words = paste(rep(Word, Occurences), collapse = " "),
              Count_Neg_Words = sum(Occurences))
  
  # Log ratio of positive to negative hits; 0.5 is added to both counts so the
  # ratio stays finite when one of them is zero.
  sentisave <- left_join(senti_words_pos_pre, senti_words_neg_pre, by = "document")
  sentisave$value <- as.numeric(log((sentisave$Count_Pos_Words+0.5)/(sentisave$Count_Neg_Words+0.5)))
  
  #Add Value to Data     
  # Assigned by position, not by key.
  # NOTE(review): this assumes `sentisave` has exactly one row per row of
  # `data_it`, in the same order - confirm.
  data_it$sentiment_value <- sentisave$value
  data_it$positive_words <-  sentisave$Pos_Words
  data_it$negative_words <-  sentisave$Neg_Words
  rm(senti_words_neg_pre, senti_words_pos_pre, sentisave)
  
  # Combine all frames again:
  # (only the Italian frame exists here)
  data <- data_it
  rm(data_it)
  gc()
  return(data)
}

# Score every chunk separately and bind the results once at the end, instead
# of re-binding the growing frame in each iteration. seq_along() keeps the
# loop from running on an empty list.
scored_chunks <- vector("list", length(dflist))
for (chunk_idx in seq_along(dflist)) {
  scored_chunks[[chunk_idx]] <- final_dat(dflist[[chunk_idx]], quant_nodes = 4)
  # cat() so that the line break is written out rather than shown as "\n".
  cat(chunk_idx, " th Element of list processed!\n", sep = "")
  gc()
}
df_pr_a <- dplyr::bind_rows(scored_chunks)
rm(scored_chunks, chunk_idx)

df_pr_a$sentiment_value <- as.numeric(df_pr_a$sentiment_value)
# The cleaned text was only needed for scoring; the raw `Text` column stays.
df_pr_a <- df_pr_a %>% dplyr::select(-text)
rm(dflist)
gc()

# Checkpoint the scored data and read it back.
write_rds(df_pr_a, "../data/CdT_data.RDS")
df_pr_a <- readRDS("../data/CdT_data.RDS")
##########################################################################################
# 4) Topic / Classification
##########################################################################################
# Helpers shared by all keyword passes below ------------------------------------

# Turn a keyword vector into one regular expression: every keyword may carry
# leading punctuation/alphanumerics and trailing alphanumerics, and all
# keywords are joined as alternatives.
build_search_pattern <- function(keywords) {
  paste0("[[:punct:]]*[a-zA-Z0-9]*", keywords, "[a-zA-Z0-9]*", collapse = "|")
}

# Flag the texts that match `pattern` (case-insensitive via tolower()).
#   texts       character vector of article texts
#   pattern     regular expression handed to str_detect()
#   cores       number of worker processes for pbmclapply()
#   chunk_size  number of texts handed to pbmclapply() at a time
# Returns a numeric vector of the same length as `texts`: 1 = match,
# 0 = no match, NA where the text is missing.
flag_texts <- function(texts, pattern, cores, chunk_size = 10000) {
  # split() creates exactly as many chunks as needed (none for empty input),
  # so no row can be dropped by rounding the number of chunks.
  chunk_rows <- split(seq_along(texts), ceiling(seq_along(texts) / chunk_size))
  flags <- vector("list", length(chunk_rows))
  for (k in seq_along(chunk_rows)) {
    flags[[k]] <- unlist(pbmclapply(chunk_rows[[k]], function(l) {
      as.numeric(str_detect(tolower(texts[l]), pattern))
    }, mc.cores = cores))
    gc()
  }
  as.numeric(unlist(flags, use.names = FALSE))
}

# Covid 19 (Yes / No):
search_pattern_covid <- build_search_pattern(
  c('corona','covid19','coronaschweiz','coronach','coronavirus','coronavirusschweiz','epidemie',
    'social distancing', 'coronatests', 'pandemie', 'corona-pandemie',
    'coronakrise','covid19ch','covidch','bag_ofsp_ufsp','coronainfoch', 'swisscovid',
    'pandemie', 'covid', 'coronakrise','swiss-covid-app', 'coronapandemie',
    'corona-sommer', 'covid-19-erkrankungen', 'corona-kredit', 'corona-infektionen',
    'lockdown', 'schutzmaske','beatmungsgerät', 'beatmungsgeräte','pandémie',
    'masques', 'crise sanitaire', 'covid-19', 'sars-cov-2', 'coronagraben', 'swisscovid',
    'coronavirus', 'covid', 'epidémie', 'social distancing', 'garder ses distances',
    'maske','contact tracing', 'masquer', 'maschera', 'respirator', 'hygienemaske', 'ffp2', 
    'atemschutz', 'swisscovid','covidioten','neuinfektionen','hospitalisierungsrate',
    'covidapp','coronaapp','swiss-covid-app', 'contact-tracing-app','dp-3t','swisscovidapp',
    'epidemiologisch','antikörper', 'maskenpflicht','maskenzwang','maskenwahn','herdenimunität',
    'coronawarnapp', 'contact-tracing', 'contact tracing', 'besondere lage', 'ausserordendliche lage',
    'swisscovid-app', 'corona-app','covid-codes','corona app', 'corona warn app',' contact tracing app', 
    'kontakt verfolgungs app', 'kontakt rückverfolgung')
)

# (This pass used round() for the number of chunks and therefore skipped the
# last, partially filled chunk.)
df_pr_a$iscovid19_txt <- flag_texts(df_pr_a$Text, search_pattern_covid, cores = 12)
df_pr_a$topic_0 <- ifelse(df_pr_a$iscovid19_txt == 1, "COVID19", "Anderes")


# Mask Topic:
search_pattern_mask <- build_search_pattern(
  c('schutzmaske', 'masques', 'maske','masquer', 'maschera', 'hygienemaske', 'ffp2', 'mascherina', 'masken', 'atemschutzmasken',
    'atemschutz', 'maskenpflicht','maskenzwang','maskenwahn', 'mundnasenschutz', 'mund-nasen-schutz', 'gesichtsschutz', 'masque')
)

df_pr_a$ismask_txt <- flag_texts(df_pr_a$Text, search_pattern_mask, cores = 6)
df_pr_a$topic_1 <- ifelse(df_pr_a$ismask_txt == 1, "Mask", "Anderes")

# CovidApp Topic:
search_pattern_covidapp <- build_search_pattern(
  c('swiss-covid-app','covidapp','coronaapp','swiss covid app', 'contact-tracing-app','dp-3t','swisscovidapp','swisscovid-app', 
    'corona-app','covid-codes', 'contact tracing', 'swiss-covid-app',
    'coronawarnapp', 'corona app', 'corona warn app', 'contact tracing app', 'kontaktverfolgungsapp', 'kontakt-verfolgungs-app', 
    'swisscovid', 'covidcodes', 'covid codes', 'corona app', 'dp3t', 'dp 3t')
)

df_pr_a$iscovidapp_txt <- flag_texts(df_pr_a$Text, search_pattern_covidapp, cores = 6)
df_pr_a$topic_2 <- ifelse(df_pr_a$iscovidapp_txt == 1, "CovidApp", "Anderes")

# Wider CovidApp definition: the same keywords plus the stand-alone words
# "app" / "apps". paste0() is required here - paste() would insert a blank in
# front of the "|" and thereby force the last keyword to be followed by a
# space.
search_pattern_covidapp_2 <- paste0(search_pattern_covidapp, "|\\bapp\\b|\\bapps\\b")

df_pr_a$iscovidapp_txt_2 <- flag_texts(df_pr_a$Text, search_pattern_covidapp_2,
                                       cores = 6, chunk_size = 100000)
# The wide definition only counts inside articles that are about COVID-19.
df_pr_a$topic_2_2 <- ifelse(df_pr_a$iscovid19_txt == 0, "Anderes", 
                           ifelse(df_pr_a$iscovidapp_txt_2 == 1, "CovidApp", "Anderes"))
saveRDS(df_pr_a, "../data/CdT_data_new.RDS")


# Collapse the individual flags into one label per article. Precedence:
# app and mask together, then app, then mask, then general COVID-19, and
# "Anderes" for everything else. Base ifelse() is used on purpose: a missing
# flag yields a missing label, which the filter below relies on.
has_mask  <- df_pr_a$ismask_txt == 1
has_app   <- df_pr_a$topic_2_2 == "CovidApp"
has_covid <- df_pr_a$iscovid19_txt == 1

label_covid <- ifelse(has_covid, "Covid19", "Anderes")
label_mask  <- ifelse(has_mask, "Masks", label_covid)
label_app   <- ifelse(has_app, "App", label_mask)
df_pr_a$topic <- ifelse(has_mask & has_app, "App & Masks", label_app)
rm(has_mask, has_app, has_covid, label_covid, label_mask, label_app)

# Number of articles per label
df_pr_a %>% group_by(topic) %>% summarise(n = n())

# Drop the one article that has no text (its label is missing)
df_pr_a <- df_pr_a %>% filter(!is.na(topic))

write_rds(df_pr_a, "../data/CdT_data.RDS")
##########################################################################################
# 5) Merge the two data sets
##########################################################################################
# Free the working frame and the sentiment dictionaries; from here on only the
# files on disk are used.
rm(df_pr_a,extendeddict_bg,extendeddict_cs,extendeddict_da,extendeddict_de,extendeddict_de_e,
   extendeddict_el,extendeddict_en,extendeddict_es,extendeddict_et,extendeddict_fi,extendeddict_fr,
   extendeddict_fr_e,extendeddict_hu,extendeddict_it,extendeddict_lt,extendeddict_lv,extendeddict_nl)

cdt <- read_rds("../data/CdT_data.RDS")
smd <- read_rds("../data/SMD_data.RDS")


names(cdt)
names(smd)

# Only the raw JSON columns are renamed by position. The derived columns
# already carry their final names; relabelling them by position swapped
# `iscovidapp_txt_2`, `topic_2_2` and `topic`, because the file stores them in
# exactly that order.
raw_col_names <- c("pubDateTime","Kürzel","Akteur","Akteur Typ","ht","tx","url","Quelle","Type")
derived_col_names <- c("la","sentiment_value","positive_words","negative_words","iscovid19_txt",
                       "topic_0","ismask_txt","topic_1","iscovidapp_txt","topic_2",
                       "iscovidapp_txt_2","topic_2_2","topic")
stopifnot(
  ncol(cdt) == length(raw_col_names) + length(derived_col_names),
  setequal(names(cdt)[-seq_along(raw_col_names)], derived_col_names)
)
colnames(cdt)[seq_along(raw_col_names)] <- raw_col_names

# Give the topic columns speaking names in both data sets (done exactly once).
cdt <- cdt %>% dplyr::rename(topic_covid = topic_0, topic_mask = topic_1, topic_app = topic_2_2, topic_app_small = topic_2)
smd <- smd %>% dplyr::rename(topic_covid = topic_0, topic_mask = topic_1, topic_app = topic_2_2, topic_app_small = topic_2)

names(cdt)

# Keep the columns needed for the merge - addressed by their NEW names, since
# the old `topic_*` names no longer exist after the rename above.
cdt_keep <- c("pubDateTime","ht","tx","url","la","sentiment_value","positive_words","negative_words",
              "iscovid19_txt","topic_covid","ismask_txt","topic_mask","iscovidapp_txt",
              "topic_app_small","iscovidapp_txt_2","topic_app","topic")
# NOTE(review): "Correire" looks like a typo for "Corriere"; the label is kept
# unchanged because downstream code may filter on it - confirm before fixing.
cdt <- cdt %>%
  dplyr::select(dplyr::all_of(cdt_keep)) %>%
  mutate(so_txt = "Correire del Ticino",
         so = "CdT",
         pubDateTime = as.Date(pubDateTime))
rm(raw_col_names, derived_col_names, cdt_keep)

names(cdt)

# Stack both sources and keep everything published from 2019-12-01 onwards.
smd_cdt <- dplyr::bind_rows(smd,cdt)
smd_cdt <- smd_cdt %>% filter(pubDateTime > as.Date("2019-11-30"))
names(smd_cdt)

# The columns were renamed before binding, so no second rename is needed
# (it would fail because `topic_0` etc. are gone).
write_rds(smd_cdt, "../data/SMD_CDT_data.RDS")

# Slim version without the text columns
smd_cdt <- smd_cdt %>% dplyr::select(c("so","so_txt","pubDateTime","la","sentiment_value","iscovid19_txt",
                                       "topic","ismask_txt","topic_mask","iscovidapp_txt","topic_app_small",
                                       "topic_covid","iscovidapp_txt_2","topic_app","selectsclass"))

write_rds(smd_cdt, "../data/SMD_CDT_data_minified.RDS")
